library(vcfR)
## Warning: package 'vcfR' was built under R version 3.5.2
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.9.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
library(ggplot2)
library(gridExtra)
library(ggridges)
## Warning: package 'ggridges' was built under R version 3.5.2
library(adegenet)
## Loading required package: ade4
## Warning: package 'ade4' was built under R version 3.5.2
## 
##    /// adegenet 2.1.3 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

RADStacksHelpR: helper functions for filtering RADseq SNP data, defined below.

#apply per-genotype hard filters (read depth and genotype quality) to a vcfR object
#vcfR: a vcfR object
#depth: minimum read depth; genotypes below this are converted to NA (NULL = skip)
#gq: minimum genotype quality; genotypes below this are converted to NA (NULL = skip)
#returns the filtered vcfR object
hard.filter.vcf <- function(vcfR, depth=NULL, gq=NULL){
  
  if (!is.null(depth)){
    #extract depth from the vcf
    dp.matrix<- extract.gt(vcfR, element='DP', as.numeric=TRUE)
    
    #calculate the % of genotypes that fall below the depth filter
    i<-round((sum(dp.matrix < depth, na.rm = TRUE)/sum(!is.na(dp.matrix)))*100, 2)
    #report filter
    print(paste0(i,"% of genotypes fall below a read depth of ",depth," and were converted to NA"))
    
    #mask failing genotypes, then propagate the NAs into the genotype matrix
    #(column 1 of @gt is FORMAT, so it is excluded from the assignment)
    dp.matrix[dp.matrix < depth] <- NA
    vcfR@gt[,-1][is.na(dp.matrix)] <- NA
  }
  
  if (!is.null(gq)){
    #extract gq from the vcf (re-extracted after depth masking so only surviving genotypes are assessed)
    gq.matrix<- extract.gt(vcfR, element='GQ', as.numeric=TRUE)
    
    #calculate the % of genotypes that fall below the gq filter
    j<-round((sum(gq.matrix < gq, na.rm = TRUE)/sum(!is.na(gq.matrix)))*100, 2)
    #report filter
    print(paste0(j,"% of genotypes fall below a genotype quality of ",gq," and were converted to NA"))
    
    #mask failing genotypes and propagate the NAs into the genotype matrix
    gq.matrix[gq.matrix < gq] <- NA
    vcfR@gt[,-1][is.na(gq.matrix)] <- NA
  }
  
  return(vcfR)
}

#filter heterozygous genotypes by allele balance (proportion of reads supporting
#the first allele); hets outside [min.ratio, max.ratio] are converted to NA
#vcfR: a vcfR object whose AD field holds comma-separated per-allele depths
#min.ratio/max.ratio: accepted allele balance window (defaults reproduce the
#original hard-coded .25 - .75 window)
#returns the filtered vcfR object
filter.allele.balance <- function(vcfR, min.ratio=0.25, max.ratio=0.75){
  
  #extract AD (allele depth) from the vcf
  ad.matrix<- extract.gt(vcfR, element='AD')
  #extract GT (genotype) from the vcf
  gt.matrix<- extract.gt(vcfR, element='GT')
  
  #mask the AD matrix to include only called hets from the GT matrix
  #recognize both allele orders and phased/unphased notation; %in% never returns
  #NA, so missing genotypes are masked too and the subscript is NA-free
  het.calls<-c("0/1","1/0","0|1","1|0")
  ad.matrix[!(gt.matrix %in% het.calls)]<-NA
  
  #split allele 1 depth from allele 2 depth (AD is "dp1,dp2")
  al1<-structure(as.numeric(gsub(",.*", "", ad.matrix)), dim=dim(ad.matrix))
  al2<-structure(as.numeric(gsub(".*,", "", ad.matrix)), dim=dim(ad.matrix))
  
  #allele balance per het genotype; flag hets outside the accepted window
  ab<-al1/(al1 + al2)
  AB<-ab > max.ratio | ab < min.ratio
  
  #percentage of hets (and of all genotypes) failing the filter
  p<-round(sum(AB, na.rm = TRUE) / sum(!is.na(AB))*100, 2)
  j<-round(sum(AB, na.rm = TRUE) / sum(!is.na(gt.matrix))*100, 2)
  
  print(paste0(p,"% of het genotypes (",j,"% of all genotypes) fall outside of ",min.ratio," - ",max.ratio," allele balance and were converted to NA"))
  
  #convert failing genotypes to NA (drop NAs from the mask so subscripted
  #assignment is safe; column 1 of @gt is FORMAT and is excluded)
  vcfR@gt[,-1][!is.na(AB) & AB]<-NA
  
  return(vcfR)
}

#visualize mean depth per SNP, and filter by a maximum mean depth
#vcfR: a vcfR object
#maxdepth: NULL = visualize only; otherwise SNPs with mean depth above this
#cutoff are removed and the filtered vcfR object is returned
max.depth <- function(vcfR, maxdepth=NULL){
  
  #extract depth from the vcf
  dp.matrix<- extract.gt(vcfR, element='DP', as.numeric=TRUE)
  
  #mean depth per SNP across successfully genotyped samples
  #(NaN for SNPs with no genotyped samples, handled explicitly below)
  snpdepth<-rowSums(dp.matrix, na.rm = TRUE)/rowSums(!is.na(dp.matrix))
  
  if (is.null(maxdepth)){
    
    #plot histogram of depth
    hist(snpdepth, xlab = "mean of the depth of all samples successfully genotyped at a given SNP", main =NULL)
    abline(v=mean(snpdepth, na.rm = TRUE), col="red", lty="dashed")
    
    #zoomed in histogram
    hist(snpdepth[snpdepth < 200], xlab = "mean of the depth of all samples successfully genotyped at a given SNP", main ="visualize distribution of SNPs below a depth of 200")
    abline(v=mean(snpdepth, na.rm = TRUE), col="red", lty="dashed")
    
    #print
    print(paste0("dashed line indicates a mean depth across all SNPs of ",round(mean(snpdepth, na.rm = TRUE),1)))
    
  }
  else {
    #plot the maxdepth cutoff
    hist(snpdepth, xlab = "mean of the depth of all samples successfully genotyped at a given SNP", main ="max depth cutoff")
    abline(v=maxdepth, col="red")
    
    #calculate % of snps that fail the max depth filter
    i<-round(sum(snpdepth > maxdepth, na.rm = TRUE)/length(snpdepth)*100, 2)
    
    print(paste0(i, "% of SNPs were above a mean depth of ", maxdepth, " and were removed from the vcf"))
    
    #filter vcf: keep SNPs at or below the cutoff (<= so the retained set matches
    #the report above, which counts only SNPs strictly above the cutoff), and
    #explicitly drop SNPs with NaN mean depth to avoid NA subscripts
    vcfR<-vcfR[!is.na(snpdepth) & snpdepth <= maxdepth,]
    
    return(vcfR)
  }
}

#visualize missing data per sample, and optionally drop high-missingness samples
#vcfR: a vcfR object
#popmap: optional two column dataframe with 'id' and 'pop' columns; ids must
#match the sample names in the vcf
#cutoff: optional proportion of missing data above which a sample is removed
#with cutoff=NULL, visualizes and returns a per-sample summary dataframe;
#with a cutoff, removes failing samples and returns the filtered vcfR object
missing.by.sample <- function(vcfR, popmap=NULL, cutoff=NULL){
  
  #extract depth from the vcf
  dp<- extract.gt(vcfR, element='DP', as.numeric=TRUE)
  
  if (is.null(cutoff)){
    
    #calculate missingness by individual
    miss<-colSums(is.na(dp))/nrow(dp)
    #calculate avg depth by individual
    avg.depth<-colMeans(dp, na.rm = TRUE)
    #per-sample summary returned at the end; built up front so it exists even
    #when no popmap is provided (previously return(df.x) errored in that case)
    df.x<-data.frame(id=colnames(dp), missingness=miss, avg.depth=avg.depth, row.names = NULL)
    
    if (is.null(popmap)) {
      print("No popmap provided")
    }
    else {
      #bring in popmap to summarize missingness and depth by population
      df.f<-merge(df.x, popmap, by="id")
      #violin + dotplot of missingness by pop, ordered by mean missingness
      plot1<-ggplot(df.f, aes(x= reorder(pop, -missingness), y=missingness)) + 
        geom_violin()+ theme_classic()+
        theme(axis.title.x = element_blank(), axis.text.x = element_text(angle = 60, hjust = 1))+
        ylab("proportion missing")+
        geom_dotplot(binaxis='y', stackdir='center', dotsize=.8)
      #violin + dotplot of average sequencing depth by pop
      plot2<-ggplot(df.f, aes(x= reorder(pop, -missingness), y=avg.depth)) + 
        geom_violin()+ theme_classic()+
        theme(axis.title.x = element_blank(), axis.text.x = element_text(angle = 60, hjust = 1))+
        ylab("average depth")+
        geom_dotplot(binaxis='y', stackdir='center', dotsize=.8)
      
      #stack the two panels
      grid.arrange(plot1,plot2)
    }
    
    #dataframe tracking SNPs retained per sample across candidate completeness cutoffs
    df.y<- data.frame(indiv=character(), snps.retained=numeric(), filt=numeric())
    #loop over candidate SNP completeness cutoffs
    for (i in c(.5,.6,.7,.8,.9,1)){
      #vector of individuals we are looping over
      indiv<-colnames(dp)
      #the completeness cutoff applied in this iteration
      filt<-rep(i, times =ncol(dp))
      #number of loci successfully genotyped in each sample after retaining only
      #SNPs whose proportion of missing samples is <= 1-i
      snps.retained<-colSums(!is.na(dp[(rowSums(is.na(dp))/ncol(dp) <= 1-i),]))
      #append this iteration to the running df
      df.y<-rbind(df.y, as.data.frame(cbind(indiv, filt, snps.retained)))
    }
    
    #make columns numeric for plotting (cbind coerced everything to character)
    df.y$filt<-as.numeric(as.character(df.y$filt))
    df.y$snps.retained<-as.numeric(as.character(df.y$snps.retained))
    #visualize color coded by individual
    print( 
      ggplot(df.y, aes(x=filt, y=snps.retained, color=indiv))+
        geom_point()+
        ggtitle("SNPs retained by filtering scheme") +
        xlab("fraction of non-missing genotypes required to retain each SNP (0-1)") + ylab("non-missing SNPs retained per sample")+
        theme_light()+
        theme(legend.position="none")
    )
    
    #dotchart of missingness by sample, with candidate cutoffs marked
    dotchart(sort(miss), cex=.5, xlab = "proportion missing data")
    abline(v=c(.5,.6,.7,.8,.9,1), lty="dashed")
    
    #return the dataframe showing the missingness and avg depth per individual
    return(df.x)
  }
  else {
    
    #calculate missingness by individual
    miss<-colSums(is.na(dp))/nrow(dp)
    #vis plot to show where cutoff was set
    dotchart(sort(miss), cex=.5)
    abline(v=cutoff, col="red")
    
    #identify samples whose missingness exceeds the cutoff
    drop.samples<-names(miss)[miss > cutoff]
    print(paste0(length(drop.samples)," samples exceed the ",cutoff," missing data cutoff, and were removed from VCF"))
    #drop those individuals from the vcfR (column 1 of @gt is FORMAT and is
    #never in drop.samples, so it is always kept)
    vcfR@gt <- vcfR@gt[,!(colnames(vcfR@gt) %in% drop.samples)]
    
    return(vcfR)
  }
}

#visualize how a SNP completeness cutoff affects the number of SNPs retained and
#the amount of missing data, and optionally filter the vcf by that cutoff
#vcfR: a vcfR object
#cutoff: optional proportion (0-1) of individuals that must be successfully
#genotyped for a SNP to be retained; NULL = visualize only, otherwise the
#function filters the vcf and returns the filtered vcfR object
missing.by.snp <- function(vcfR, cutoff=NULL){
  
  if (!is.null(cutoff)) {
    
    #do basic vis to show cutoff
    #extract genotype from the vcf
    #NOTE(review): as.numeric=TRUE is applied to the default GT field here; only
    #is.na() is used downstream, so this relies on missing genotypes coming
    #through as NA -- confirm against vcfR::extract.gt behavior
    dp.matrix<- extract.gt(vcfR, as.numeric=TRUE)

    #calculate the proportion of individuals MISSING (not genotyped) at each SNP
    miss<-rowSums(is.na(dp.matrix))/ncol(dp.matrix)
    
    #loop that stores, for each candidate completeness filter value, the number
    #of SNPs retained and the resulting overall missingness
    #initialize df.x
    df.x<- data.frame(filt=numeric(), missingness=numeric(), snps.retained=numeric())
    #loop
    for (i in c(.3,.5,.6,.65,.7,.75,.8,.85,.9,.95,1)){
      #subset the matrix to only snps passing a given filter level
      #(completeness 1-miss must be at least i)
      gen.z.mat<-dp.matrix[1-miss >= i,]
      #calc the total number of snps retained at the given filter level
      snps.retained<-nrow(gen.z.mat)
      #calculate the total missing data proportion for the given cutoff
      missingness<-sum(is.na(gen.z.mat))/(ncol(gen.z.mat)*nrow(gen.z.mat))
      #the completeness cutoff applied in this iteration
      filt<-i
      df.x<-rbind(df.x, as.data.frame(cbind(filt, missingness, snps.retained)))
      }
    
    #make columns numeric for plotting
    df.x$filt<-as.numeric(as.character(df.x$filt))
    df.x$missingness<-as.numeric(as.character(df.x$missingness))
    df.x$snps.retained<-as.numeric(as.character(df.x$snps.retained))
    
    #visualize dotplot for total loci retained at each filter level,
    #with the chosen cutoff marked in red
    plot1<-ggplot(df.x, aes(x=filt)) +
      scale_x_continuous(breaks=c(.3,.5,.6,.65,.7,.75,.8,.85,.9,.95,1))+
      geom_point(aes(y=snps.retained)) +
      theme_bw() +
      labs(x = "SNP completeness cutoff", y = "total loci retained")+
      geom_vline(xintercept = cutoff, color = "red")
    
    #visualize dotplot for missing data % at each filter level,
    #with the chosen cutoff marked in red
    plot2<-ggplot(df.x, aes(x=filt)) +
      scale_x_continuous(breaks=c(.3,.5,.6,.65,.7,.75,.8,.85,.9,.95,1))+
      geom_point(aes(y=missingness)) +
      theme_bw() +
      labs(x = "SNP completeness cutoff", y = "total % missing data")+
      geom_vline(xintercept = cutoff, color = "red")
    
    grid.arrange(plot1,plot2)
    
    #calc % of SNPs falling below the completeness cutoff
    p<-round(sum(miss > 1-cutoff)/length(miss)*100, 2)
    
    #report # of SNPs filtered
    print(paste0(p,"% of SNPs fell below a completeness cutoff of ", cutoff, " and were removed from the VCF"))
    
    #do filtering: retain SNPs whose missing proportion is at most 1-cutoff
    vcfR <- vcfR[miss <= 1-cutoff, ]
    
    return(vcfR)
    
  } 
  else {
    
    ###Part 1
    #Vis to understand the interplay between retaining samples and setting a missing data cutoff
    #extract genotype from the vcf
    dp.matrix<- extract.gt(vcfR, as.numeric=TRUE)
    
    #calculate vector containing the proportion of individuals missing at each SNP
    miss<-rowSums(is.na(dp.matrix))/ncol(dp.matrix)
    #initialize df.y (per-sample missingness at each candidate filter level)
    df.y<- data.frame(filt=numeric(), snps=numeric())
    #loop
    for (i in c(.3,.5,.6,.65,.7,.75,.8,.85,.9,.95,1)){
      #subset matrix to only SNPs retained at given filter level
      gen.z.mat<-dp.matrix[1-miss >= i,]
      #calc the proportion of missing genotypes per sample at the given filter level
      snps<-colSums(is.na(gen.z.mat) == TRUE)/nrow(gen.z.mat)
      #the completeness cutoff applied in this iteration, repeated per sample
      filt<-rep(i, times= length(snps))
      df.y<-rbind(df.y, as.data.frame(cbind(filt, snps)))
      }
    
    #make columns numeric for plotting
    df.y$filt<-as.numeric(as.character(df.y$filt))
    df.y$snps<-as.numeric(as.character(df.y$snps))
    
    #visualize filtering levels as stacked ridgeline distributions with jittered points
    print(
      ggplot(df.y, aes(x = snps, y = as.character(filt), fill = filt, color = filt)) +
        geom_density_ridges(jittered_points = TRUE, position = "raincloud", alpha = .25, cex=.5) +
        theme_classic() +
        labs(x = "missing data proportion in each sample", y = "SNP completeness cutoff") +
        theme(legend.position = "none")
      )
    
    ###Part 2
    #Vis to make decision on cutoff
    #calculate the proportion of individuals missing at each SNP
    miss<-rowSums(is.na(dp.matrix))/ncol(dp.matrix)
    
    #loop that stores, for each candidate completeness filter value, the number
    #of SNPs retained and the resulting overall missingness
    #initialize df.x
    df.x<- data.frame(filt=numeric(), missingness=numeric(), snps.retained=numeric())
    #loop
    for (i in c(.3,.5,.6,.65,.7,.75,.8,.85,.9,.95,1)){
      #subset the matrix to only snps passing a given filter level
      gen.z.mat<-dp.matrix[1-miss >= i,]
      #calc the total number of snps retained at the given filter level
      snps.retained<-nrow(gen.z.mat)
      #calculate the total missing data proportion for the given cutoff
      missingness<-sum(is.na(gen.z.mat))/(ncol(gen.z.mat)*nrow(gen.z.mat))
      #the completeness cutoff applied in this iteration
      filt<-i
      df.x<-rbind(df.x, as.data.frame(cbind(filt, missingness, snps.retained)))
      }
    
    #make columns numeric for plotting
    df.x$filt<-as.numeric(as.character(df.x$filt))
    df.x$missingness<-as.numeric(as.character(df.x$missingness))
    df.x$snps.retained<-as.numeric(as.character(df.x$snps.retained))
    
    #visualize dotplot for total loci retained at each filter level
    plot1<-ggplot(df.x, aes(x=filt)) +
      scale_x_continuous(breaks=c(.3,.5,.6,.65,.7,.75,.8,.85,.9,.95,1))+
      geom_point(aes(y=snps.retained)) +
      theme_bw() +
      labs(x = "SNP completeness cutoff", y = "total loci retained")

    #visualize dotplot for missing data % at each filter level
    plot2<-ggplot(df.x, aes(x=filt)) +
      scale_x_continuous(breaks=c(.3,.5,.6,.65,.7,.75,.8,.85,.9,.95,1))+
      geom_point(aes(y=missingness)) +
      theme_bw() +
      labs(x = "SNP completeness cutoff", y = "total proportion missing data")

    grid.arrange(plot1,plot2)
  }
  
}

#visualize the folded site frequency spectrum and DAPC clustering at a series of
#minor allele count (MAC) cutoffs, or filter the vcf by a chosen minimum MAC
#vcfR: a vcfR object (biallelic SNPs with /-separated diploid genotypes)
#popmap: two column dataframe with 'id' and 'pop' columns; required for the
#visualization branch (min.mac=NULL) where it seeds the DAPC comparison
#min.mac: NULL = visualize only; otherwise SNPs with MAC below this value are
#removed and the filtered vcfR object is returned
#NOTE(review): with min.mac=NULL and popmap=NULL this function errors at
#find.clusters; also 'pop' must be a factor for levels() to work -- confirm
min.mac <- function(vcfR, popmap=NULL, min.mac=NULL){
  
  if (is.null(min.mac)) {
    
    #convert vcfR to matrix and make numeric (count of alt alleles per genotype)
    gt.matrix<-extract.gt(vcfR)
    gt.matrix[gt.matrix == "0/0"]<-0
    gt.matrix[gt.matrix == "0/1"]<-1
    gt.matrix[gt.matrix == "1/1"]<-2
    class(gt.matrix) <- "numeric"
    
    #calc sfs: total alt allele count per SNP
    sfs<-rowSums(gt.matrix, na.rm = TRUE)
    #fold sfs: if the alt count exceeds the number of genotyped samples (half the
    #total allele count), the alt allele is the major allele, so replace the
    #count with the complementary (minor) allele count
    for (i in 1:length(sfs)) {
      if (sfs[i] <= sum(!is.na(gt.matrix[i,]))){}
      else {
        sfs[i]<-(sum(!is.na(gt.matrix[i,]))*2 - sfs[i])
      }
    }
    
    #hist folded mac with cutoff shown
    hist(sfs, main="folded SFS", xlab = "MAC")
    
    #convert vcfR into genlight for clustering/DAPC
    genlight<-vcfR2genlight(vcfR)

    #run dapc for mac 1,2,3,4,5,10
    #note: filtering is progressive -- each iteration subsets the already-filtered
    #genlight and sfs from the previous iteration, which is valid because the
    #cutoffs increase monotonically
    for (i in c(1,2,3,4,5,10)){
      
      #filter genlight by given mac
      genlight<-genlight[,sfs >= i]
      #subset sfs vector to only SNPs left in the genlight
      sfs<-sfs[sfs >= i]
      
      #assign samples to the number of groups present in popmap, retain all PCAs
      grp<-find.clusters(genlight, n.pca = ncol(gt.matrix)-1, n.clust = length(levels(popmap$pop)))
      
      #check how well that k-means assignment matched up to the provided popmap
      samps<-merge(popmap, data.frame(group=grp$grp, id=labels(grp$grp)), by='id')
      print(paste0("for ", i, " minimum MAC cutoff, compare k means clustering to popmap assignment"))
      print(table(samps$pop, samps$group))
      
      #run dapc, retain all discriminant axes, and enough PC axes to explain 75% of variance
      dapc1<-dapc(genlight, grp$grp, n.da = length(levels(popmap$pop))-1, pca.select = "percVar", perc.pca = 75)
      
      #plot compoplot (membership probabilities per sample)
      compoplot(dapc1, legend=FALSE, col=funky(length(levels(popmap$pop))), show.lab =TRUE, cex.names=.4, main=paste0("min. MAC ",i,", total SNPs ",length(sfs)))
      
      #print progress
      print(paste0("DAPC with min. MAC ", i, " and ", length(sfs), " total SNPs, complete"))
    }
    
  } 
    else {
      
    #convert vcfR to matrix and make numeric (count of alt alleles per genotype)
    gt.matrix<-extract.gt(vcfR)
    gt.matrix[gt.matrix == "0/0"]<-0
    gt.matrix[gt.matrix == "0/1"]<-1
    gt.matrix[gt.matrix == "1/1"]<-2
    class(gt.matrix) <- "numeric"
    
    #calc sfs: total alt allele count per SNP
    sfs<-rowSums(gt.matrix, na.rm = TRUE)
    #fold sfs (see note in the visualization branch above)
    for (i in 1:length(sfs)) {
      if (sfs[i] <= sum(!is.na(gt.matrix[i,]))){}
      else {
        sfs[i]<-(sum(!is.na(gt.matrix[i,]))*2 - sfs[i])
      }
    }
    
    #hist folded mac with the chosen cutoff shown (SNPs left of the line fail)
    hist(sfs, main="folded SFS", xlab = "MAC")
    abline(v=min.mac-1, col="red")
    
    #calculate % of SNPs to be removed, and print it
    p<-round((sum(sfs < min.mac)/length(sfs))*100, 2)
    print(paste0(p, "% of SNPs fell below a minor allele count of ", min.mac, " and were removed from the VCF"))
    
    #filter vcfR: retain SNPs at or above the minimum MAC
    vcfR <- vcfR[sfs >= min.mac,]
    
    return(vcfR)

    }
  
}

Read in your unfiltered vcf file

#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/aph.data/populations.snps.vcf")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 210336
##   column count: 124
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 210336
##   Character matrix gt cols: 124
##   skip: 0
##   nrows: 210336
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant 71000
Processed variant 72000
Processed variant 73000
Processed variant 74000
Processed variant 75000
Processed variant 76000
Processed variant 77000
Processed variant 78000
Processed variant 79000
Processed variant 80000
Processed variant 81000
Processed variant 82000
Processed variant 83000
Processed variant 84000
Processed variant 85000
Processed variant 86000
Processed variant 87000
Processed variant 88000
Processed variant 89000
Processed variant 90000
Processed variant 91000
Processed variant 92000
Processed variant 93000
Processed variant 94000
Processed variant 95000
Processed variant 96000
Processed variant 97000
Processed variant 98000
Processed variant 99000
Processed variant 100000
Processed variant 101000
Processed variant 102000
Processed variant 103000
Processed variant 104000
Processed variant 105000
Processed variant 106000
Processed variant 107000
Processed variant 108000
Processed variant 109000
Processed variant 110000
Processed variant 111000
Processed variant 112000
Processed variant 113000
Processed variant 114000
Processed variant 115000
Processed variant 116000
Processed variant 117000
Processed variant 118000
Processed variant 119000
Processed variant 120000
Processed variant 121000
Processed variant 122000
Processed variant 123000
Processed variant 124000
Processed variant 125000
Processed variant 126000
Processed variant 127000
Processed variant 128000
Processed variant 129000
Processed variant 130000
Processed variant 131000
Processed variant 132000
Processed variant 133000
Processed variant 134000
Processed variant 135000
Processed variant 136000
Processed variant 137000
Processed variant 138000
Processed variant 139000
Processed variant 140000
Processed variant 141000
Processed variant 142000
Processed variant 143000
Processed variant 144000
Processed variant 145000
Processed variant 146000
Processed variant 147000
Processed variant 148000
Processed variant 149000
Processed variant 150000
Processed variant 151000
Processed variant 152000
Processed variant 153000
Processed variant 154000
Processed variant 155000
Processed variant 156000
Processed variant 157000
Processed variant 158000
Processed variant 159000
Processed variant 160000
Processed variant 161000
Processed variant 162000
Processed variant 163000
Processed variant 164000
Processed variant 165000
Processed variant 166000
Processed variant 167000
Processed variant 168000
Processed variant 169000
Processed variant 170000
Processed variant 171000
Processed variant 172000
Processed variant 173000
Processed variant 174000
Processed variant 175000
Processed variant 176000
Processed variant 177000
Processed variant 178000
Processed variant 179000
Processed variant 180000
Processed variant 181000
Processed variant 182000
Processed variant 183000
Processed variant 184000
Processed variant 185000
Processed variant 186000
Processed variant 187000
Processed variant 188000
Processed variant 189000
Processed variant 190000
Processed variant 191000
Processed variant 192000
Processed variant 193000
Processed variant 194000
Processed variant 195000
Processed variant 196000
Processed variant 197000
Processed variant 198000
Processed variant 199000
Processed variant 200000
Processed variant 201000
Processed variant 202000
Processed variant 203000
Processed variant 204000
Processed variant 205000
Processed variant 206000
Processed variant 207000
Processed variant 208000
Processed variant 209000
Processed variant 210000
Processed variant: 210336
## All variants processed
### check the metadata present in your vcf
vcfR
## ***** Object of Class vcfR *****
## 115 samples
## 87 CHROMs
## 210,336 variants
## Object size: 685.5 Mb
## 57.1 percent missing data
## *****        *****         *****
vcfR@fix[1:10,1:8]
##       CHROM        POS      ID        REF ALT QUAL FILTER INFO            
##  [1,] "Pseudochr1" "615693" "48:25:+" "T" "C" NA   "PASS" "NS=3;AF=0.333" 
##  [2,] "Pseudochr1" "615724" "48:56:+" "C" "T" NA   "PASS" "NS=3;AF=0.333" 
##  [3,] "Pseudochr1" "674543" "56:43:+" "A" "T" NA   "PASS" "NS=9;AF=0.222" 
##  [4,] "Pseudochr1" "674545" "56:45:+" "A" "T" NA   "PASS" "NS=18;AF=0.222"
##  [5,] "Pseudochr1" "674596" "56:96:+" "A" "T" NA   "PASS" "NS=14;AF=0.250"
##  [6,] "Pseudochr1" "690980" "61:7:+"  "G" "A" NA   "PASS" "NS=10;AF=0.100"
##  [7,] "Pseudochr1" "690991" "61:18:+" "T" "C" NA   "PASS" "NS=10;AF=0.300"
##  [8,] "Pseudochr1" "691006" "61:33:+" "T" "A" NA   "PASS" "NS=10;AF=0.300"
##  [9,] "Pseudochr1" "690931" "62:45:-" "G" "A" NA   "PASS" "NS=4;AF=0.250" 
## [10,] "Pseudochr1" "690906" "62:70:-" "A" "G" NA   "PASS" "NS=4;AF=0.250"
vcfR@gt[1:10,1:2]
##       FORMAT           A_californica_333849                
##  [1,] "GT:DP:AD:GQ:GL" NA                                  
##  [2,] "GT:DP:AD:GQ:GL" NA                                  
##  [3,] "GT:DP:AD:GQ:GL" NA                                  
##  [4,] "GT:DP:AD:GQ:GL" "1/1:67:0,67:40:-256.40,-21.43,0.00"
##  [5,] "GT:DP:AD:GQ:GL" "1/1:67:0,67:40:-226.28,-20.45,0.00"
##  [6,] "GT:DP:AD:GQ:GL" "0/0:2:2,0:32:-0.00,-2.62,-6.23"    
##  [7,] "GT:DP:AD:GQ:GL" "1/1:2:0,2:27:-4.96,-2.15,-0.00"    
##  [8,] "GT:DP:AD:GQ:GL" "0/0:2:2,0:31:-0.00,-2.51,-5.68"    
##  [9,] "GT:DP:AD:GQ:GL" NA                                  
## [10,] "GT:DP:AD:GQ:GL" NA
#generate popmap file. Two column popmap with the same format as stacks, and the columns must be named 'id' and 'pop'
#alternative: derive the popmap directly from the vcf sample names (substring of each column name)
#popmap<-data.frame(id=colnames(vcfR@gt)[2:length(colnames(vcfR@gt))],pop=substr(colnames(vcfR@gt)[2:length(colnames(vcfR@gt))], 3,11))
#read in locality info for samples (csv is assumed to contain 'id' and 'species' columns -- verify against the file)
locs<-read.csv("~/Desktop/aph.data/rad.sampling.csv")
#build the popmap, treating species as the population assignment
popmap<-data.frame(id=locs$id, pop=locs$species)
#rename mislabeled sample so its id matches the popmap
colnames(vcfR@gt)[colnames(vcfR@gt) == "A_californica_334171"]<-"A_woodhouseii_334171"

Step 1: Implement quality filters that don't involve missing data. This is because removing low-data samples will alter percentage/quantile-based missing data cutoffs, so we wait to implement those until after deciding on our final set of samples for downstream analysis.

Note:If you have a very large vcf output file that you would like to work with in Rstudio, you could implement some percentage based filters (e.g. remove all SNPs above 50% missing data) via vcftools to reduce your filesize before starting to filter in R. Just be aware that once you drop low data samples, your previously enforced (per SNP or locus) missing data % will no longer be exact.

Jon Puritz has an excellent filtering tutorial that is focused specifically on filtering RADseq data: https://www.ddocent.com/filtering/ Some of these RAD specific filters can't be applied to a vcf output by stacks, which doesn't retain metadata like mapping quality and strand, but we can follow these guidelines for hard filtering (he suggests minimum depth=3, gq =30), and can implement suggested filters like allele balance and max depth, here in R.

#hard filter to minimum depth of 5, and minimum genotype quality of 30
vcfR<-hard.filter.vcf(vcfR=vcfR, depth = 5, gq = 30)
## [1] "32.92% of genotypes fall below a read depth of 5 and were converted to NA"
## [1] "2.01% of genotypes fall below a genotype quality of 30 and were converted to NA"

Use this function to filter for allele balance from Puritz SNP filtering tutorial "Allele balance: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous, we expect that the allele balance in our data (for real loci) should be close to 0.5"

#execute allele balance filter
vcfR<-filter.allele.balance(vcfR)
## [1] "7.56% of het genotypes (0.39% of all genotypes) fall outside of .25 - .75 allele balance and were converted to NA"

max depth filter (super high depth loci are likely multiple loci stuck together into a single paralogous locus). Note: we want to apply this 'per SNP' rather than 'per genotype' otherwise we will simply be removing most of the genotypes from our deepest sequenced samples (because sequencing depth is so variable between samples)

#visualize and pick appropriate max depth cutoff
max.depth(vcfR)

## [1] "dashed line indicates a mean depth across all SNPs of 46.7"
#filter vcf by the max depth cutoff you chose
vcfR<-max.depth(vcfR, maxdepth = 100)

## [1] "12.85% of SNPs were above a mean depth of 100 and were removed from the vcf"

Note: It may be a good idea to additionally filter out SNPs that are significantly out of HWE if you have a really good idea of what the population structure in your sample looks like and good sample sizes in each pop. For this dataset, which is highly structured (many isolated island pops) with species boundaries that are in flux, I am not going to use a HWE filter, because I don't feel comfortable confidently identifying populations in which we can expect HWE.

#check vcfR to see how many SNPs we have left
vcfR
## ***** Object of Class vcfR *****
## 115 samples
## 78 CHROMs
## 183,293 variants
## Object size: 408 Mb
## 79.79 percent missing data
## *****        *****         *****

Step 2: visualize missing data by sample. Check out the visualizations and make decision on which samples to keep for downstream analysis.

Determining which samples to retain is highly project specific, and is contingent on your sampling, your taxa, the sequencing results, etc. It is also wise to take missing data into account by population. Often we don't know a-priori what our populations will look like, but in general we want to avoid making inferences about populations if they have consistently less information than the rest of our dataset. On the flip side of that coin, you may have designed a project to get information about a rare sample/population that lacks existing genetic sampling, and therefore must retain specific samples despite low sequencing and high missing data. This is a reality, and simply needs to be addressed carefully.

#run function to visualize per-sample missingness and mean depth
#(called with no 'cutoff' argument, this only visualizes/reports — nothing is filtered)
missing.by.sample(vcfR=vcfR, popmap = popmap)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

##                        id missingness  avg.depth
## 1    A_californica_333849   0.6237445  51.464757
## 2    A_californica_333854   0.7526147  22.655390
## 3    A_californica_333855   0.7810773  19.995439
## 4    A_californica_333857   0.6242846  54.910304
## 5    A_californica_333860   0.7300879  25.357346
## 6    A_californica_333862   0.7610765  42.624415
## 7    A_californica_333868   0.9944406   6.135427
## 8    A_californica_333871   0.8656359  20.685358
## 9    A_californica_333873   0.9036570  15.119769
## 10   A_californica_333874   0.7854801  34.143795
## 11   A_californica_333914   0.8695749  12.709529
## 12   A_californica_333917   0.7296187  25.804859
## 13   A_californica_333925   0.9999836   6.333333
## 14   A_californica_333931   0.7794133  19.883063
## 15   A_californica_333932   0.7316864  25.908276
## 16   A_californica_333934   0.7124713  27.219233
## 17   A_californica_334002   0.6452947  38.650727
## 18   A_californica_334012   0.5646479  69.190671
## 19   A_californica_334015   0.6650881  34.004333
## 20   A_californica_334017   0.7889008  18.560463
## 21   A_californica_334019   0.8452259  13.874158
## 22   A_woodhouseii_334171   0.7022309  20.210301
## 23   A_californica_342037   0.8231793  15.116476
## 24   A_californica_342048   0.7382988  25.118058
## 25   A_californica_342050   0.5380947  84.986063
## 26   A_californica_342051   0.9957227   6.121173
## 27   A_californica_342066   0.7087286  30.566288
## 28   A_californica_342072   0.8688821  12.858195
## 29   A_californica_343428   0.7787368  36.514079
## 30   A_californica_343430   0.8203314  15.783190
## 31   A_californica_343432   0.8257053  15.501894
## 32   A_californica_343438   0.8606766  12.859733
## 33   A_californica_343442   0.9766712   6.938026
## 34   A_californica_343451   0.8599674  11.511240
## 35   A_californica_393615   0.8548772  20.423346
## 36   A_californica_393616   0.9315522   9.482225
## 37   A_californica_393721   0.7879406  39.580668
## 38  A_coerulescens_396251   0.6963878  37.259209
## 39  A_coerulescens_396254   0.7661504  23.264331
## 40  A_coerulescens_396256   0.8496451  14.468159
## 41  A_coerulescens_396259   0.9686513   7.441351
## 42  A_coerulescens_396262   0.9999945   7.000000
## 43  A_coerulescens_396263   0.8836071  11.647136
## 44  A_coerulescens_396264   0.6583776  42.406982
## 45  A_coerulescens_396265   0.6423322  52.268266
## 46     A_insularis_334025   0.7548788  22.905807
## 47     A_insularis_334029   0.7657248  22.650986
## 48     A_insularis_334031   0.6734409  40.321555
## 49     A_insularis_334032   0.6127621  54.369368
## 50     A_insularis_334033   0.5983425  57.727714
## 51     A_insularis_334034   0.6917613  36.228203
## 52     A_insularis_334037   0.7042986  31.126734
## 53     A_insularis_334038   0.8724992  12.577493
## 54   A_sumichrasti_343502   0.7825285  19.362510
## 55   A_sumichrasti_343503   0.5148696 112.849507
## 56   A_sumichrasti_343510   0.8146028  32.333029
## 57   A_sumichrasti_343511   0.8546098  23.564261
## 58   A_sumichrasti_343512   0.6009613  53.978603
## 59   A_sumichrasti_343513   0.9441604   9.241036
## 60   A_sumichrasti_343514   0.8126170  15.732982
## 61   A_sumichrasti_343515   0.8013781  17.450063
## 62   A_sumichrasti_393633   0.9739925   7.262639
## 63   A_sumichrasti_393635   0.8023165  17.513965
## 64   A_sumichrasti_393636   0.9226321   8.988083
## 65   A_sumichrasti_393637   0.9835618   7.444076
## 66   A_sumichrasti_393638   0.8484394  22.848164
## 67   A_sumichrasti_393639   0.9915054   7.447656
## 68   A_sumichrasti_393640   0.7470935  24.243593
## 69   A_woodhouseii_334059   0.9996508   5.234375
## 70   A_woodhouseii_334062   0.8287005  16.033346
## 71   A_woodhouseii_334063   0.7889117  20.056861
## 72   A_woodhouseii_334086   0.8162396  17.940146
## 73   A_woodhouseii_334088   0.7953986  18.089142
## 74   A_woodhouseii_334096   0.8918889  10.343208
## 75   A_woodhouseii_334132   0.7262416  26.334669
## 76   A_woodhouseii_334133   0.7055643  31.875537
## 77   A_woodhouseii_334134   0.8930401  11.360571
## 78   A_woodhouseii_334142   0.7135679  30.351517
## 79   A_woodhouseii_334148   0.9255727   9.471558
## 80   A_woodhouseii_334153   0.7827358  19.428622
## 81   A_woodhouseii_334156   0.9264456  12.825323
## 82   A_woodhouseii_334161   0.5679322  72.232590
## 83   A_woodhouseii_334170   0.9462882   9.909192
## 84   A_woodhouseii_334172   0.9996999   6.927273
## 85   A_woodhouseii_334188   0.8253779  15.382666
## 86   A_woodhouseii_334190   0.7395209  25.553012
## 87   A_woodhouseii_334196   0.7022309  32.123417
## 88   A_woodhouseii_334210   0.6947292  33.003199
## 89   A_woodhouseii_334211   0.6853290  35.871665
## 90   A_woodhouseii_334217   0.8945077   9.695542
## 91   A_woodhouseii_334230   0.7581632  23.006633
## 92   A_woodhouseii_334240   0.9967156   5.918605
## 93   A_woodhouseii_334241   0.8453787  14.164603
## 94   A_woodhouseii_334242   0.8479047  13.683442
## 95   A_woodhouseii_334243   0.8161850  15.518432
## 96   A_woodhouseii_334244   0.6644880  38.046279
## 97   A_woodhouseii_334247   0.8519420  13.999005
## 98   A_woodhouseii_334250   0.6729499  33.929053
## 99   A_woodhouseii_343453   0.9800974   6.711897
## 100  A_woodhouseii_343458   0.8653140  12.249808
## 101  A_woodhouseii_343461   0.6582466  44.578774
## 102  A_woodhouseii_343476   0.7542787  22.983481
## 103  A_woodhouseii_343480   0.8583743  19.734543
## 104  A_woodhouseii_343481   0.8229283  25.881162
## 105  A_woodhouseii_343483   0.7033384  50.145726
## 106  A_woodhouseii_343497   0.8209642  14.082094
## 107  A_woodhouseii_393605   0.7116366  50.444121
## 108  A_woodhouseii_393606   0.7674161  22.199690
## 109  A_woodhouseii_393697   0.8169488  29.873838
## 110  A_woodhouseii_393698   0.8998380  15.307696
## 111  A_woodhouseii_393699   0.9781988   7.480480
## 112  A_woodhouseii_393702   0.8096436  31.629016
## 113  A_woodhouseii_393712   0.7190182  47.964429
## 114  A_woodhouseii_393713   0.8812339  17.507097
## 115  A_woodhouseii_395768   0.6519289  75.294942
#run function again with a cutoff to drop samples above the missingness threshold from the vcf
vcfR<-missing.by.sample(vcfR=vcfR, cutoff = .91)

## [1] "20 samples fall below 0.91 missing data cutoff, and were removed from VCF"
#subset popmap to only include retained individuals
#(colnames of vcfR@gt are sample IDs; the first column is FORMAT, which never matches popmap$id)
popmap<-popmap[popmap$id %in% colnames(vcfR@gt),]

#alternatively, you can drop individuals from vcfR manually using the following syntax, if a strict cutoff doesn't work for your dataset
#vcfR@gt <- vcfR@gt[,colnames(vcfR@gt) != "E_trichroa_4702" & colnames(vcfR@gt) != "E_trichroa_27882"]

Step 3: Set the arbitrary missing data cutoff

We can visualize the effect that typical missing data cutoffs will have on both the number of SNPs retained and the total missing data in our entire dataset. We want to choose a cutoff that minimizes the overall missing data in the dataset, while maximizing the total number of loci retained. Note: This will also depend on the samples that we decided to include above. A good rule of thumb is that samples shouldn't be above 50% missing data after applying this cutoff. So if we are retaining low data samples out of necessity or project design, we may have to set a more stringent cutoff at the expense of total SNPs retained for downstream analyses.

#visualize missing data by SNP and the effect of various completeness cutoffs on the missingness of each sample
#(called with no 'cutoff' argument, this only visualizes — nothing is filtered)
missing.by.snp(vcfR)
## Picking joint bandwidth of 0.0654

#choose a value that retains an acceptable amount of missing data in each sample, and maximizes SNPs retained while minimizing overall missing data, and filter vcf
#here 0.85 means a SNP must be genotyped in at least 85% of samples to be retained
vcfR<-missing.by.snp(vcfR, cutoff = .85)

## [1] "90.68% of SNPs fell below a completeness cutoff of 0.85 and were removed from the VCF"

Step 4: investigate the effect of a minor allele frequency cutoff on our downstream inferences.

MAF cutoffs can be helpful in removing spurious and uninformative loci from the dataset, but also have the potential to bias downstream inferences. Linck and Battey (2019) have an excellent paper on just this topic. From the paper-

"We recommend researchers using model‐based programs to describe population structure observe the following best practices: (a) duplicate analyses with nonparametric methods such as PCA and DAPC with cross validation, (b) exclude singletons, (c) compare alignments with multiple assembly parameters. When seeking to exclude only singletons in alignments with missing data (a ubiquitous problem for reduced‐representation library preparation methods), it is preferable to filter by the count (rather than frequency) of the minor allele, because variation in the amount of missing data across an alignment will cause a static frequency cutoff to remove different SFS classes at different sites."

Our package contains a convenient wrapper function that streamlines the investigation of different MAF cutoffs. Warning: this is a heavy function if it is run without the 'min.mac' option. It is converting the entire vcf and running 6 unique dapc iterations, which will take time for large datasets. If you set 'min.mac' it will just filter your dataset, and should run quickly.

#use min.mac() with no 'min.mac' argument to investigate the effect of multiple MAC cutoffs
#(this runs a separate DAPC per cutoff, so it can be slow on large datasets)
min.mac(vcfR = vcfR, popmap = popmap)

## [1] "for 1 minimum MAC cutoff, compare k means clustering to popmap assignment"
##               
##                 1  2  3  4  5
##   californica   0  0 31  0  0
##   coerulescens  0  0  0  6  0
##   insularis     0  0  0  0  8
##   sumichrasti   0 10  0  0  0
##   woodhouseii  39  0  0  0  0

## [1] "DAPC with min. MAC 1 and 16307 total SNPs, complete"
## [1] "for 2 minimum MAC cutoff, compare k means clustering to popmap assignment"
##               
##                 1  2  3  4  5
##   californica   0  0  0  0 31
##   coerulescens  0  0  6  0  0
##   insularis     8  0  0  0  0
##   sumichrasti   0 10  0  0  0
##   woodhouseii   0  0  0 39  0

## [1] "DAPC with min. MAC 2 and 11273 total SNPs, complete"
## [1] "for 3 minimum MAC cutoff, compare k means clustering to popmap assignment"
##               
##                 1  2  3  4  5
##   californica   0 31  0  0  0
##   coerulescens  0  0  6  0  0
##   insularis     0  0  0  8  0
##   sumichrasti  10  0  0  0  0
##   woodhouseii   0  0  0  0 39

## [1] "DAPC with min. MAC 3 and 8582 total SNPs, complete"
## [1] "for 4 minimum MAC cutoff, compare k means clustering to popmap assignment"
##               
##                 1  2  3  4  5
##   californica   0  0 31  0  0
##   coerulescens  0  0  0  0  6
##   insularis     8  0  0  0  0
##   sumichrasti   0 10  0  0  0
##   woodhouseii   0  0  0 39  0

## [1] "DAPC with min. MAC 4 and 7107 total SNPs, complete"
## [1] "for 5 minimum MAC cutoff, compare k means clustering to popmap assignment"
##               
##                 1  2  3  4  5
##   californica   0  0  0 31  0
##   coerulescens  6  0  0  0  0
##   insularis     0  0  8  0  0
##   sumichrasti   0  0  0  0 10
##   woodhouseii   0 39  0  0  0

## [1] "DAPC with min. MAC 5 and 6064 total SNPs, complete"
## [1] "for 10 minimum MAC cutoff, compare k means clustering to popmap assignment"
##               
##                 1  2  3  4  5
##   californica   0 31  0  0  0
##   coerulescens  0  0  6  0  0
##   insularis     8  0  0  0  0
##   sumichrasti   0  0  0 10  0
##   woodhouseii   0  0  0  0 39

## [1] "DAPC with min. MAC 10 and 3574 total SNPs, complete"
#based on these visualizations, remove singletons (minor allele count < 2) from the dataset,
#since the k-means/DAPC comparison above shows this doesn't affect group inference
vcfR<-min.mac(vcfR, min.mac = 2)

## [1] "34.02% of SNPs fell below a minor allele count of 2 and were removed from the VCF"

Check once more to see how many SNPs and individuals remain compared to our original, unfiltered vcf.

#check the fully filtered vcfR object summary
vcfR
## ***** Object of Class vcfR *****
## 95 samples
## 35 CHROMs
## 11,273 variants
## Object size: 78.6 Mb
## 5.84 percent missing data
## *****        *****         *****
#plot depth per snp and per sample as a heatmap to confirm filtering worked as expected
dp <- extract.gt(vcfR, element = "DP", as.numeric=TRUE)
heatmap.bp(dp, rlabels = FALSE)

#plot genotype quality per snp and per sample
gq <- extract.gt(vcfR, element = "GQ", as.numeric=TRUE)
heatmap.bp(gq, rlabels = FALSE)

We can use the convenient function 'write.vcf' from vcfR to export our filtered vcf file for downstream analyses

#write.vcf(vcfR, "~/Desktop/aph.data/filtered.vcf.gz")

If you need physically unlinked loci (which is a requirement of some programs, e.g. structure) this filtering step should always be done last, because it is not quality aware. Introducing a quality-blind linkage filter before doing the quality based filtering shown here would potentially remove quality SNPs while leaving us with only the low quality SNPs in a locus or genomic region. If filtering for linkage is needed, it can be done on our output vcf file with a simple one-liner via vcftools (set thin to whatever bp distance you assume linkage decays at in your study organism): vcftools --vcf vcf.vcf --thin 10000 --recode --out vcf